In [1]:
import numpy as np
import shutil
import os
import json
import glob
from itertools import count
from scipy import sparse
import random
import timeit

In [2]:
def get_all_files(basedir, ext='.json'):
    '''
    Walk a dataset directory and collect its files.

    Arguments

    basedir: path to the dataset root directory
    ext: extension used when globbing file paths (default '.json')

    Returns

    files: ndarray of paths to all files matching ext under basedir
    filenames: dict mapping each filename (extension stripped; files of ANY
               extension are included) to the directory that contains it
    '''
    filenames = {}
    files = np.array([])
    for root, _, fnames in os.walk(basedir):
        matches = glob.glob(os.path.join(root, '*' + ext))
        files = np.concatenate((files, matches))
        for name in fnames:
            stem = os.path.splitext(name)[0]
            filenames[stem] = root
    return files, filenames

In [3]:
def get_stochastic_matrix(files, t=0):
    '''
    Construct the similarity (column) stochastic matrix in COOrdinate format.

    Arguments

    files: iterable of paths to per-song JSON files, each containing
           'track_id' and a 'similars' list of [track_id, score] pairs
    t: threshold for similarity strength; an edge is added only when the
       similarity score is strictly greater than t

    Returns

    M: column sparse matrix (COO); M[j, i] = 1/out_degree(i) for each
       similar song j of song i whose score passes the threshold
    song_to_row: dict where key is track id of song and value is row index in matrix
    song_to_col: dict where key is track id of song and value is col index in matrix

    Note: Since M is square, for each song its corresponding row index equals
    the column index. Columns of songs with some similars filtered out by t
    sum to less than 1 (out_degree counts ALL similars, kept or not).
    '''
    row = []
    col = []
    data = []
    song_to_row = {}
    song_to_col = {}
    row_count = count()
    col_count = count()

    for file in files:
        with open(file, 'r') as f:
            song = json.load(f)
            tid = song['track_id']
            if tid not in song_to_row:
                song_to_row[tid] = next(row_count)
            if tid not in song_to_col:
                song_to_col[tid] = next(col_count)
            out_degree = len(song['similars'])
            for s in song['similars']:
                sid = s[0]
                if sid not in song_to_col:
                    song_to_col[sid] = next(col_count)
                if sid not in song_to_row:
                    song_to_row[sid] = next(row_count)
                if s[1] > t:  # add edge only if similarity strictly greater than threshold t
                    row.append(song_to_row[tid])
                    col.append(song_to_col[sid])
                    # out_degree >= 1 here: this loop body only runs when
                    # 'similars' is non-empty, so the former out_degree == 0
                    # branch was unreachable dead code and has been removed
                    data.append(1.0 / out_degree)
    # explicit square shape: rows and cols are assigned in lockstep above
    M = sparse.coo_matrix((data, (row, col)),
                          shape=(len(song_to_row), len(song_to_col)))
    M = M.T  # transpose so edges point similar -> source (column form)
    return M, song_to_row, song_to_col

In [4]:
def get_songs_to_tags_matrix(files, song_to_row, g=50):
    '''
    Construct the songs-to-tags matrix in COOrdinate format.

    Arguments

    files: iterable of paths to per-song JSON files, each containing
           'track_id' and a 'tags' list of [tag, count] pairs
    song_to_row: dict mapping track id to row index (from get_stochastic_matrix)
    g: threshold for tag strength; a (song, tag) entry is added only when the
       tag count is >= g

    Returns

    N: songs-to-tags binary sparse matrix (COO) with explicit shape
       (len(song_to_row), number of distinct tags)
    tags_to_col: dict where key is tag and value is column index in N
    '''
    row = []
    col = []
    data = []
    tags_to_col = {}
    col_count = count()

    for file in files:
        with open(file, 'r') as f:
            song = json.load(f)
            tid = song['track_id']
            for t in song['tags']:
                tag = t[0]
                cnt = t[1]
                if tag not in tags_to_col:
                    tags_to_col[tag] = next(col_count)
                if int(cnt) >= g:
                    row.append(song_to_row[tid])
                    col.append(tags_to_col[tag])
                    data.append(1)
    # explicit shape: without it, trailing rows for songs with no strong tags
    # and columns for tags that never pass the threshold would be silently
    # dropped, making later column lookups on N go out of bounds
    N = sparse.coo_matrix((data, (row, col)),
                          shape=(len(song_to_row), len(tags_to_col)))
    return N, tags_to_col

In [5]:
def get_teleport_set(N, tags, user_tags):
    '''
    Find all songs carrying every user-specified tag.

    Arguments

    N: songs-to-tags sparse matrix
    tags: dict mapping tag name to column index in N
    user_tags: non-empty list of tags specified by the user

    Returns

    teleport_set: sorted ndarray of row indices of songs that have ALL user_tags

    Raises KeyError for an unknown tag and IndexError when user_tags is empty
    (same as the original behavior).
    '''
    # map user-specified tags to their column indices
    tag_ids = [tags[t] for t in user_tags]
    # CSC gives cheap column slicing; spmatrix.getcol() is deprecated in SciPy
    N_csc = sparse.csc_matrix(N)
    # rows with a nonzero in a CSC column slice come out sorted and unique
    teleport_set = N_csc[:, tag_ids[0]].nonzero()[0]
    # original code re-intersected tag_ids[0] with itself; start from index 1
    for tid in tag_ids[1:]:
        teleport_set = np.intersect1d(teleport_set, N_csc[:, tid].nonzero()[0])
    return teleport_set

Topic-Specific PageRank


In [6]:
def compute_topic_specific_page_rank(M, teleport_set, iterations=50, beta=0.8):
    '''
    Power-iterate topic-specific PageRank on a column stochastic matrix.

    Arguments

    M: stochastic column sparse matrix
    teleport_set: array of row indices of songs in the user's topics
    iterations: number of power-iteration steps
    beta: probability of following an edge (1 - beta is the teleport probability)

    Returns

    r: (n, 1) ndarray with the stationary probability distribution
    '''
    n_nodes = M.shape[0]
    # teleport distribution: uniform over the teleport set, zero elsewhere
    teleport_vec = np.zeros((n_nodes, 1))
    teleport_vec[teleport_set, :] = 1.0 / teleport_set.shape[0]
    # start from the teleport vector for faster convergence
    r = teleport_vec.copy()
    for _ in range(iterations):
        r = beta * M * r + (1 - beta) * teleport_vec
    return r

In [7]:
def get_top_songs(r, tid_to_ids, n=5):
    '''
    Pick the n highest-ranked tracks.

    Arguments

    r: ndarray with the stationary probability distribution (one row per track)
    tid_to_ids: dict where keys are track ids and values are the row indices
                from the stochastic matrix
    n: number of tracks to return

    Returns

    tracks: list of track ids in ascending rank order (best track last)
    '''
    # invert the mapping: row index -> track id
    ids_to_tid = {idx: tid for tid, idx in tid_to_ids.items()}
    # row indices of the n largest probabilities, in ascending order
    top_indices = np.argsort(r, axis=0)[-n:]
    return [ids_to_tid[top_indices.item(i)] for i in range(top_indices.shape[0])]

Main Program for Topic Specific PageRank

Ensure that all previous methods have run before running main program


In [8]:
'''
Parameters of our model
'''
basedir=os.path.abspath('lastfm_subset')  # root directory of the Last.fm dataset subset
user_tags = ['rock']  # tags the teleport set is built from
t = 0  # similarity threshold: an edge requires a score strictly greater than t
g = 50  # tag-strength threshold: a (song, tag) entry requires count >= g
iterations=50  # number of PageRank power iterations
beta=0.8  # probability of following an edge (1 - beta = teleport probability)
n=5  # number of top-ranked songs to report

In [11]:
def main():
    '''
    Run topic-specific PageRank on the dataset configured by the module-level
    parameters above and return the titles of the top-ranked songs.
    '''
    files, filenames = get_all_files(basedir)
    # stochastic matrix plus track-id -> matrix-index mapping
    M, track_ids, _ = get_stochastic_matrix(files, t)
    M = M.tocsr()  # convert from COO to CSR for slicing
    # songs-to-tags matrix reusing the same track-id -> row mapping
    N, tag_cols = get_songs_to_tags_matrix(files, track_ids, g)
    teleport_set = get_teleport_set(N, tag_cols, user_tags)
    ranks = compute_topic_specific_page_rank(M, teleport_set, iterations, beta)
    top_tracks = get_top_songs(ranks, track_ids, n)
    titles = []
    for track in top_tracks:
        path = os.path.join(filenames[track], track + '.json')
        with open(path, 'r') as f:
            titles.append(json.load(f)['title'])
    return titles

In [12]:
main()  # run the full pipeline; the notebook displays the returned titles below


Out[12]:
['Fire', 'Here Without You', 'Do You Want To', 'Contagious', 'Blue Orchid']